Verimiz çok büyük olduğu için %4'lük bir örneklem çekerek 400 gözlemlik bir veri seti elde ediyoruz.
library(readxl)
# Load the 400-row sample from Excel and open it in the data viewer.
ogveri <- read_excel("C:/Users/SİMAY/Documents/Veri Setleri/ogveri.xlsx")
View(ogveri)
Kategorik değişkenleri tanımlıyoruz
# Encode the categorical columns as factors with explicit level orders.
# (The original repeated the Country and Region assignments twice; the
# duplicates were no-ops and have been removed.)
ogveri$`Ship Mode` <- factor(ogveri$`Ship Mode`, levels = c("Second Class", "Standard Class", "First Class", "Same Day"))
ogveri$Segment <- factor(ogveri$Segment, levels = c("Consumer", "Home Office", "Corporate"))
ogveri$Country <- factor(ogveri$Country, levels = c("United States"))
ogveri$Region <- factor(ogveri$Region, levels = c("West", "Central", "East", "South"))
ogveri$Category <- factor(ogveri$Category, levels = c("Technology", "Office Supplies", "Furniture"))
ogveri$`Sub-Category` <- as.factor(ogveri$`Sub-Category`)
ogveri$City <- as.factor(ogveri$City)
ogveri$State <- as.factor(ogveri$State)
# NOTE(review): these create free-standing global vectors (copies of the full
# 400-row sample), not columns of ogveri; the Hampel-filter section below
# relies on these globals rather than on the training set — confirm intended.
Quantity <- as.numeric(ogveri$Quantity)
Discount <- as.numeric(ogveri$Discount)
Profit <- as.numeric(ogveri$Profit)
Sales <- as.numeric(ogveri$Sales)
summary(ogveri)
## Ship Mode Segment Country City
## Second Class : 68 Consumer :207 United States:400 New York City: 35
## Standard Class:247 Home Office: 71 Los Angeles : 29
## First Class : 63 Corporate :122 Chicago : 20
## Same Day : 22 San Francisco: 17
## Seattle : 17
## Houston : 16
## (Other) :266
## State Region Category Sub-Category
## California: 81 West :120 Technology : 75 Binders : 62
## New York : 45 Central: 97 Office Supplies:240 Paper : 55
## Texas : 42 East :114 Furniture : 85 Furnishings: 35
## Illinois : 30 South : 69 Phones : 35
## Ohio : 20 Chairs : 34
## Florida : 18 Accessories: 33
## (Other) :164 (Other) :146
## Sales Quantity Discount Profit
## Min. : 1.504 Min. : 1.00 Min. :0.0000 Min. :-3701.893
## 1st Qu.: 14.778 1st Qu.: 2.00 1st Qu.:0.0000 1st Qu.: 1.544
## Median : 48.388 Median : 3.00 Median :0.2000 Median : 7.370
## Mean : 264.304 Mean : 3.85 Mean :0.1678 Mean : 38.284
## 3rd Qu.: 197.519 3rd Qu.: 5.00 3rd Qu.:0.2000 3rd Qu.: 27.750
## Max. :13999.960 Max. :14.00 Max. :0.8000 Max. : 6719.981
##
çektiğimiz örneklemi bilgisayara aktarıyoruz.
library("openxlsx")
## Warning: package 'openxlsx' was built under R version 4.1.3
# Save the drawn sample to disk, then count missing values per row.
write.xlsx(ogveri, 'ogveri.xlsx')
rowSums(is.na(ogveri))
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [38] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [75] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [112] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [149] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [186] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [223] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [260] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [297] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [334] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## [371] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# Column-wise count of missing values (all zero: the data is complete).
colSums(is.na(ogveri))
## Ship Mode Segment Country City State Region
## 0 0 0 0 0 0
## Category Sub-Category Sales Quantity Discount Profit
## 0 0 0 0 0 0
Veride eksik değişken olmadığı için kendimiz yaratıyoruz
# The data has no missing values, so introduce some artificially:
# set a random 5% of Quantity (20 rows of 400) to NA.
# NOTE(review): no set.seed() before sample(), so the NA positions differ
# between runs — confirm reproducibility is not required here.
data_miss<-ogveri
aa<-sample(1:nrow(data_miss),floor(nrow(data_miss)*0.05))
data_miss$Quantity[aa]<-NA
colSums(is.na(data_miss))
## Ship Mode Segment Country City State Region
## 0 0 0 0 0 0
## Category Sub-Category Sales Quantity Discount Profit
## 0 0 0 20 0 0
# Inspect the dataset with injected NAs.
View(data_miss)
Mice paketini kullanarak eksik gözlemlerimizin yapısını inceliyoruz.
library(mice)
## Warning: package 'mice' was built under R version 4.1.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
# Missing-data pattern: which variables are missing together and how often.
md.pattern(data_miss,rotate.names = TRUE)
## Ship Mode Segment Country City State Region Category Sub-Category Sales
## 380 1 1 1 1 1 1 1 1 1
## 20 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0
## Discount Profit Quantity
## 380 1 1 1 0
## 20 1 1 0 1
## 0 0 20 20
Eksik gözlemlerin “Quantity” değişkeninde 20 adet olduğunu gözlemliyoruz. 380 tane de dolu veri bulunmakta.
Şimdi de aggr fonksiyonu ile eksik gözlemlerin yapısını inceleyelim. Bunun için öncelikle VIM ve ISLR kütüphanelerini import ediyoruz.
library(VIM)
## Warning: package 'VIM' was built under R version 4.1.3
## Zorunlu paket yükleniyor: colorspace
## Zorunlu paket yükleniyor: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.1.3
# Missingness ratio and pattern plot (VIM): only Quantity has NAs (5%).
aggr(data_miss,col=c("navyblue","pink"),numbers=TRUE, sortVars=TRUE, labels=names(data_miss),cex.axis=.7,gap=3,ylab=c("Missing Ratio","Missing Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## Quantity 0.05
## Ship Mode 0.00
## Segment 0.00
## Country 0.00
## City 0.00
## State 0.00
## Region 0.00
## Category 0.00
## Sub-Category 0.00
## Sales 0.00
## Discount 0.00
## Profit 0.00
Eksik gözlemlerin veri setimizin %5 kadarlık bir kısmını kapladığını görüyoruz. Eksik gözlemlerin yalnızca “Quantity” değişkeninde bulunduğunu da görebiliyoruz.
Karar ağacı yöntemiyle eksik gözlemlerimizi doldurmayı tercih ediyoruz. Karar ağacı öğrenmesi (decision tree learning) yöntemi, makine öğrenmesi (machine learning) konularından birisidir. Literatürde karar ağacı öğrenmesinin alt yöntemleri olarak kabul edilebilecek sınıflandırma ağacı (classification tree) veya regresyon ağacı (regression tree, tahmin ağacı) gibi uygulamaları vardır.
Karar ağacı öğrenmesinde, bir ağaç yapısı oluşturularak ağacın yaprakları seviyesinde sınıf etiketleri ve bu yapraklara giden ve başlangıçtan çıkan kollar ile de özellikler üzerindeki işlemeler ifade edilmektedir.
library(rpart)
# Fit a regression tree (anova method) predicting Quantity from Discount,
# Profit and Sales; used below to impute the artificially created NAs.
data_dt<-data_miss
rtree <- rpart(Quantity ~ Discount + Profit+ Sales, data_dt, method="anova")
library(rattle)
## Warning: package 'rattle' was built under R version 4.1.3
## Zorunlu paket yükleniyor: tibble
## Warning: package 'tibble' was built under R version 4.1.3
## Zorunlu paket yükleniyor: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:VIM':
##
## wine
# Draw the fitted tree, then impute: keep observed Quantity values and
# replace each NA with the tree's prediction for that row.
# NOTE(review): predictions are leaf means, so imputed values need not be
# whole numbers even though Quantity is a count — confirm acceptable.
fancyRpartPlot(rtree,cex=0.5)
data_dt$Quantity <- ifelse(is.na(data_dt$Quantity), predict(rtree,data_dt,type="vector"),data_dt$Quantity)
library(mice)
# Re-check: the dataset is now completely observed.
md.pattern(data_dt,rotate.names = TRUE)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
## Ship Mode Segment Country City State Region Category Sub-Category Sales
## 400 1 1 1 1 1 1 1 1 1
## 0 0 0 0 0 0 0 0 0
## Quantity Discount Profit
## 400 1 1 1 0
## 0 0 0 0
eksik değerleri tamamlanmış data_dt’yi bilgisayarımıza kaydediyoruz
library("openxlsx")
# Save the imputed dataset to disk.
write.xlsx(data_dt, "data_imputed.xlsx")
Rastgeleliği sabitlemek için seed fonksiyonunu kullanıyoruz. Veri setimizin %80’i ile bir eğitim verisi oluşturuyoruz.
# Fix the RNG seed for reproducibility, then split 80/20 into train/test
# and write both parts to Excel.
set.seed(52685136)
trainIndex <- sample(1:nrow(ogveri), size = round(0.8*nrow(ogveri)), replace=FALSE)
tra <- ogveri[trainIndex,]
tst <- ogveri[-trainIndex,]
library("openxlsx")
write.xlsx(tra, 'train.xlsx')
write.xlsx(tst, 'test.xlsx')
Eğitim verimizi data frame formatına çeviriyoruz. Analizlerimizde bundan sonrası için eğitim verimizi kullanacağız.
# Convert the training tibble to a plain data.frame; all later analysis
# uses the training set only.
tra<-as.data.frame(tra)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Structure overview: 320 rows, 12 columns.
glimpse(tra)
## Rows: 320
## Columns: 12
## $ `Ship Mode` <fct> Standard Class, Standard Class, Standard Class, Standar~
## $ Segment <fct> Consumer, Consumer, Consumer, Corporate, Consumer, Corp~
## $ Country <fct> United States, United States, United States, United Sta~
## $ City <fct> Belleville, San Francisco, Columbus, Newark, Huntsville~
## $ State <fct> New Jersey, California, Georgia, Delaware, Texas, Texas~
## $ Region <fct> East, West, South, East, Central, Central, Central, Cen~
## $ Category <fct> Technology, Furniture, Office Supplies, Furniture, Furn~
## $ `Sub-Category` <fct> Accessories, Furnishings, Binders, Chairs, Tables, Stor~
## $ Sales <dbl> 239.970, 16.740, 36.400, 291.100, 211.372, 37.224, 7.99~
## $ Quantity <dbl> 3, 2, 5, 5, 2, 3, 1, 7, 4, 5, 3, 4, 3, 5, 3, 7, 2, 3, 6~
## $ Discount <dbl> 0.0, 0.0, 0.0, 0.0, 0.3, 0.2, 0.2, 0.8, 0.7, 0.0, 0.2, ~
## $ Profit <dbl> 71.9910, 4.3524, 17.1080, 75.6860, -45.2940, 3.7224, 0.~
# Summary statistics of the training set.
summary(tra)
## Ship Mode Segment Country City
## Second Class : 52 Consumer :161 United States:320 New York City: 30
## Standard Class:204 Home Office: 59 Los Angeles : 26
## First Class : 46 Corporate :100 Chicago : 17
## Same Day : 18 San Francisco: 14
## Houston : 12
## Philadelphia : 12
## (Other) :209
## State Region Category Sub-Category
## California: 63 West :91 Technology : 64 Binders : 49
## New York : 37 Central:80 Office Supplies:189 Paper : 40
## Texas : 34 East :91 Furniture : 67 Phones : 30
## Illinois : 25 South :58 Accessories: 29
## Florida : 17 Art : 28
## Ohio : 15 Chairs : 27
## (Other) :129 (Other) :117
## Sales Quantity Discount Profit
## Min. : 1.504 Min. : 1.000 Min. :0.0000 Min. :-3701.893
## 1st Qu.: 13.885 1st Qu.: 2.000 1st Qu.:0.0000 1st Qu.: 1.028
## Median : 50.180 Median : 3.000 Median :0.2000 Median : 6.815
## Mean : 228.150 Mean : 3.791 Mean :0.1681 Mean : 20.582
## 3rd Qu.: 194.816 3rd Qu.: 5.000 3rd Qu.:0.2000 3rd Qu.: 26.892
## Max. :5083.960 Max. :14.000 Max. :0.8000 Max. : 1906.485
##
“Discount” değişkenimizi, değeri 0 ise “indirim yok”, 0’ın üstünde ise “indirimli” olarak kategorize ediyoruz. Benzer şekilde “Profit” değişkenini, değer 0 ve üzerindeyse “kar”, 0’ın altındaysa “zarar” olarak etiketliyoruz.
# Derive two categorical features from the numeric columns:
#  - Indirim_kat: "indirim yok" when Discount is 0 (or below), "indirimli" otherwise.
#  - Karlilik:    "kar" for non-negative Profit, "zarar" for negative Profit.
# Both are stored as factors (levels in default sorted order, as before).
tra$Indirim_kat <- ifelse(tra$Discount > 0, "indirimli", "indirim yok")
tra$Karlilik <- ifelse(tra$Profit < 0, "zarar", "kar")
tra$Karlilik <- as.factor(tra$Karlilik)
tra$Indirim_kat <- as.factor(tra$Indirim_kat)
summary(tra)
## Ship Mode Segment Country City
## Second Class : 52 Consumer :161 United States:320 New York City: 30
## Standard Class:204 Home Office: 59 Los Angeles : 26
## First Class : 46 Corporate :100 Chicago : 17
## Same Day : 18 San Francisco: 14
## Houston : 12
## Philadelphia : 12
## (Other) :209
## State Region Category Sub-Category
## California: 63 West :91 Technology : 64 Binders : 49
## New York : 37 Central:80 Office Supplies:189 Paper : 40
## Texas : 34 East :91 Furniture : 67 Phones : 30
## Illinois : 25 South :58 Accessories: 29
## Florida : 17 Art : 28
## Ohio : 15 Chairs : 27
## (Other) :129 (Other) :117
## Sales Quantity Discount Profit
## Min. : 1.504 Min. : 1.000 Min. :0.0000 Min. :-3701.893
## 1st Qu.: 13.885 1st Qu.: 2.000 1st Qu.:0.0000 1st Qu.: 1.028
## Median : 50.180 Median : 3.000 Median :0.2000 Median : 6.815
## Mean : 228.150 Mean : 3.791 Mean :0.1681 Mean : 20.582
## 3rd Qu.: 194.816 3rd Qu.: 5.000 3rd Qu.:0.2000 3rd Qu.: 26.892
## Max. :5083.960 Max. :14.000 Max. :0.8000 Max. : 1906.485
##
## Indirim_kat Karlilik
## indirim yok:147 kar :250
## indirimli :173 zarar: 70
##
##
##
##
##
# Re-inspect structure: now 14 columns (two new factor columns added).
glimpse(tra)
## Rows: 320
## Columns: 14
## $ `Ship Mode` <fct> Standard Class, Standard Class, Standard Class, Standar~
## $ Segment <fct> Consumer, Consumer, Consumer, Corporate, Consumer, Corp~
## $ Country <fct> United States, United States, United States, United Sta~
## $ City <fct> Belleville, San Francisco, Columbus, Newark, Huntsville~
## $ State <fct> New Jersey, California, Georgia, Delaware, Texas, Texas~
## $ Region <fct> East, West, South, East, Central, Central, Central, Cen~
## $ Category <fct> Technology, Furniture, Office Supplies, Furniture, Furn~
## $ `Sub-Category` <fct> Accessories, Furnishings, Binders, Chairs, Tables, Stor~
## $ Sales <dbl> 239.970, 16.740, 36.400, 291.100, 211.372, 37.224, 7.99~
## $ Quantity <dbl> 3, 2, 5, 5, 2, 3, 1, 7, 4, 5, 3, 4, 3, 5, 3, 7, 2, 3, 6~
## $ Discount <dbl> 0.0, 0.0, 0.0, 0.0, 0.3, 0.2, 0.2, 0.8, 0.7, 0.0, 0.2, ~
## $ Profit <dbl> 71.9910, 4.3524, 17.1080, 75.6860, -45.2940, 3.7224, 0.~
## $ Indirim_kat <fct> indirim yok, indirim yok, indirim yok, indirim yok, ind~
## $ Karlilik <fct> kar, kar, kar, kar, zarar, kar, kar, zarar, zarar, kar,~
Aykırı/uç değerleri incelemek için “Quantity” değişkeni üzerinde box-plot grafiğini deniyoruz.
library(ggplot2)
# Box plot of Quantity.
# NOTE(review): Quantity here is the global vector built from the FULL
# 400-row sample (ogveri), not from the training set tra — confirm intended.
ggplot() +
aes(x = "", y = Quantity) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal()
Fakat box-plot grafiği bize istediğimiz kadar bilgi ve detay sağlayamamakta. Bu nedenle istatistiksel yöntem olan “Hampel filter” yardımı ile potansiyel aykırı değerleri bulmayı deneyeceğiz.
Frank Rudolf Hampel tarafından literatüre kazandırılan ve popülerleştirilen ortanca mutlak sapma değeri, veri setindeki gözlemlerin ortanca değerden ne kadar uzakta olduğunun ölçüsüdür.
aykırı değerleri belirlemek için alt ve üst eşik değerleri şöyle hesaplanır:
Alt eşik = Ortanca – 3 * (MAD), Üst eşik = Ortanca + 3 * (MAD)
Genel bir ifade olarak, ortanca değerden 3 ortanca mutlak sapma uzaklıkta olan gözlemleri aykırı değer olarak konumlayabilirsiniz.
# Hampel filter bounds for Profit: median ± 3 * MAD.
# NOTE(review): mad(..., constant = 1) is the raw median absolute deviation,
# not the default 1.4826-scaled version — this matches the formula given in
# the text above. Profit is the global full-sample (400-row) vector.
lower_bound_profit <- median(Profit) - 3 * mad(Profit, constant = 1)
lower_bound_profit
## [1] -24.20725
upper_bound_profit <- median(Profit) + 3 * mad(Profit, constant = 1)
upper_bound_profit
## [1] 38.94725
# Indices of Profit values outside the Hampel bounds (potential outliers).
outlier_ind_profit <- which(Profit < lower_bound_profit | Profit > upper_bound_profit)
outlier_ind_profit
## [1] 1 2 8 9 12 29 31 34 35 36 38 42 43 44 45 50 52 59
## [19] 67 72 76 91 92 93 95 96 97 99 103 105 106 109 110 123 134 135
## [37] 136 137 138 141 146 151 155 156 159 166 174 180 181 191 193 195 198 207
## [55] 210 211 216 222 225 227 229 233 235 238 241 243 247 252 253 254 263 266
## [73] 268 275 280 289 292 294 295 296 299 306 311 313 317 319 320 321 328 329
## [91] 332 337 339 343 345 352 354 361 362 366 371 372 373 377 378 381 384 385
## [109] 387 391 392 394 395 398 399
# Hampel bounds for Discount (global 400-row vector); no value falls outside.
lower_bound_discount <- median(Discount) - 3 * mad(Discount, constant = 1)
lower_bound_discount
## [1] -0.4
upper_bound_discount <- median(Discount) + 3 * mad(Discount, constant = 1)
upper_bound_discount
## [1] 0.8
outlier_ind_discount <- which(Discount < lower_bound_discount | Discount > upper_bound_discount)
outlier_ind_discount
## integer(0)
# Hampel bounds and potential outlier indices for Sales (global vector).
lower_bound_sales <- median(Sales) - 3 * mad(Sales, constant = 1)
lower_bound_sales
## [1] -72.836
upper_bound_sales <- median(Sales) + 3 * mad(Sales, constant = 1)
upper_bound_sales
## [1] 169.612
outlier_ind_sales <- which(Sales < lower_bound_sales | Sales > upper_bound_sales)
outlier_ind_sales
## [1] 1 2 8 9 31 34 35 36 38 43 50 52 53 59 65 67 72 75
## [19] 76 77 89 93 95 96 97 105 107 109 110 121 123 124 127 134 135 136
## [37] 137 138 141 146 155 156 179 180 181 195 198 200 201 203 210 211 217 222
## [55] 227 229 235 238 241 243 247 251 252 254 261 266 268 273 275 280 289 290
## [73] 294 295 296 299 302 306 311 313 314 317 320 321 325 328 329 332 335 337
## [91] 349 351 352 354 357 361 364 366 371 377 378 381 384 385 387 391 392 394
## [109] 395 398 399
# Hampel bounds and potential outlier indices for Quantity (global vector).
upper_bound_quantity <- median(Quantity) + 3 * mad(Quantity, constant = 1)
upper_bound_quantity
## [1] 6
lower_bound_quantity <- median(Quantity) - 3 * mad(Quantity, constant = 1)
lower_bound_quantity
## [1] 0
outlier_ind_quantity <- which(Quantity < lower_bound_quantity | Quantity > upper_bound_quantity)
outlier_ind_quantity
## [1] 3 11 26 31 33 34 38 47 48 50 67 75 85 91 109 117 120 122 123
## [20] 135 136 138 148 149 155 166 167 173 176 209 224 246 247 254 255 267 268 270
## [39] 275 280 289 312 328 333 337 347 352 358 361 366 371 378 384 391 392 399
Uç değerler bilgi verici olduğu için eleme yapmamayı seçiyoruz.
# Histogram of training-set Sales.
hist(tra$Sales, col = "darkgreen")
Sales değişkenimizin dağılımının, bu grafiğe baktığımız zaman normal dağılıma çok uzak olduğunu gözlemliyoruz. Yüksek derecede sağa çarpıklık bulunmaktadır. İlerleyen adımlarda dönüşüm gerekmektedir.
# Histogram of training-set Profit.
hist(tra$Profit, col = "green")
Profit değişkenimizin dağılımının, bu grafiğe baktığımız zaman normal dağılıma çok uzak olduğunu gözlemliyoruz. Yüksek derecede sola çarpıklık bulunmaktadır. İlerleyen adımlarda dönüşüm gerekmektedir.
# Histogram of training-set Discount.
hist(tra$Discount, col = "yellow")
Discount değişkenimizin dağılımının, bu grafiğe baktığımız zaman normal dağılıma çok uzak olduğunu gözlemliyoruz. Yüksek derecede sağa çarpıklık bulunmaktadır. İlerleyen adımlarda dönüşüm gerekmektedir.
# Histogram of training-set Quantity.
hist(tra$Quantity, col = "purple")
Quantity değişkenimizin dağılımının, bu grafiğe baktığımız zaman normal dağılıma çok uzak olduğunu gözlemliyoruz. Yüksek derecede sağa çarpıklık bulunmaktadır. İlerleyen adımlarda dönüşüm gerekmektedir.
# Lowercase copies of the numeric TRAINING columns (the earlier capitalized
# globals hold the full 400-row sample).
quantity <- as.numeric(tra$Quantity)
discount <- as.numeric(tra$Discount)
profit <- as.numeric(tra$Profit)
sales <- as.numeric(tra$Sales)
library(ggplot2)
# Box plot of training-set Discount.
ggplot() +
aes(x = "", y = discount) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal()
Burda da Discount değişkeninin aykırı değerlere sahip olduğunu ve sağa çarpık olduğunu bir kez daha kutu grafiği sayesinde gözlemliyoruz.
library(ggplot2)
# Box plot of training-set Quantity.
ggplot() +
aes(x = "", y = quantity) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal()
Burda da Quantity değişkeninin aykırı değerlere sahip olduğunu ve sağa çarpık olduğunu bir kez daha kutu grafiği sayesinde gözlemliyoruz.
library(ggplot2)
# Box plot of training-set Profit.
ggplot() +
aes(x = "", y = profit) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal()
Bu grafikte boxplot verinin dağılımından dolayı çok açıklayıcı bir görüntü sağlamamaktadır. Fakat yine de aykırı değerlerin olduğunu ve verininin dağılımının sola çarpık olduğunu gözlemleyebiliyoruz.
library(ggplot2)
# Box plot of training-set Sales.
ggplot() +
aes(x = "", y = sales) +
geom_boxplot(fill = "#0c4c8a") +
theme_minimal()
Bu grafikte Sales değişkenlerinin birbirinden fazlasıyla değişkenlik gösterdiğini , verinin oldukça sağa çarpık olduğunu ve uç değerlerin çok fazla olduğunu gözlemleyebiliyoruz.
# QQ plot of Profit, drawn twice (ggplot2 and base-graphics versions).
ggplot(tra, aes(sample=Profit))+stat_qq()
qqnorm(tra$Profit)
Profit değişkeninin QQ grafiğinde iç bükey (konkav) bir yapı görülmektedir; bu da dağılımın (histogramda ve çarpıklık katsayısında da gördüğümüz gibi) sola çarpık olduğunu göstermektedir. Aynı zamanda aykırı değerlerin varlığı bu grafikte de aşikardır.
# QQ plots of Discount and Sales (ggplot2 and base-graphics versions of each).
ggplot(tra, aes(sample=Discount))+stat_qq()
qqnorm(tra$Discount)
ggplot(tra, aes(sample=Sales))+stat_qq()
qqnorm(tra$Sales)
Dış bükey (konveks) olduğundan dağılımın sağa çarpık olduğunu söyleyebiliriz. Aynı zamanda uç değerlerin varlığı da aşikardır.
# QQ plot of Quantity; the step pattern reflects its discrete values.
ggplot(tra, aes(sample=Quantity))+stat_qq()
qqnorm(tra$Quantity)
Buradan quantity değişkeninin kesikli değişken olduğunu görüyoruz.
# Numeric columns for the correlation analysis, selected by name rather than
# by position (original: tra[, c(9, 10, 11, 12)]) so the selection survives
# column reordering; per glimpse(tra), those positions are exactly these columns.
cor_tra<-tra[,c("Sales","Quantity","Discount","Profit")]
library(GGally)
## Warning: package 'GGally' was built under R version 4.1.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pearson correlation matrix of the numeric columns.
cor(cor_tra)
## Sales Quantity Discount Profit
## Sales 1.00000000 0.33806435 -0.01139695 0.2699386
## Quantity 0.33806435 1.00000000 -0.06993926 0.0575779
## Discount -0.01139695 -0.06993926 1.00000000 -0.2540963
## Profit 0.26993861 0.05757790 -0.25409629 1.0000000
# Pairwise scatter plots, then ggpairs colored by discount status and region.
plot(cor_tra)
ggpairs(cor_tra)
p <- GGally::ggpairs(tra[,c(1:3,6)], aes(color = tra$Indirim_kat))
p
p <- GGally::ggpairs(tra[,c(1:3,6)], aes(color = tra$Region))
p
Değişken türlerine göre incelemeler
library(funModeling)
## Warning: package 'funModeling' was built under R version 4.1.3
## Zorunlu paket yükleniyor: Hmisc
## Warning: package 'Hmisc' was built under R version 4.1.3
## Zorunlu paket yükleniyor: lattice
## Warning: package 'lattice' was built under R version 4.1.3
## Zorunlu paket yükleniyor: survival
## Zorunlu paket yükleniyor: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## funModeling v.1.9.4 :)
## Examples and tutorials at livebook.datascienceheroes.com
## / Now in Spanish: librovivodecienciadedatos.ai
##
## Attaching package: 'funModeling'
## The following object is masked from 'package:GGally':
##
## range01
# Percentile, skewness and kurtosis summary of each numeric column.
profiling_num(tra) # over the numeric variables
## variable mean std_dev variation_coef p_01 p_05 p_25
## 1 Sales 228.149556 517.1744894 2.2668222 2.43024 4.56030 13.8850
## 2 Quantity 3.790625 2.2887019 0.6037796 1.00000 1.00000 2.0000
## 3 Discount 0.168125 0.2161205 1.2854751 0.00000 0.00000 0.0000
## 4 Profit 20.582392 253.5346654 12.3180372 -261.01925 -54.71581 1.0285
## p_50 p_75 p_95 p_99 skewness kurtosis iqr
## 1 50.1800 194.8165 1160.6060 2512.9046 4.855860 34.416659 180.9315
## 2 3.0000 5.0000 8.0000 12.4300 1.444466 5.671234 3.0000
## 3 0.2000 0.2000 0.7000 0.8000 1.587077 4.932676 0.2000
## 4 6.8154 26.8918 135.1641 614.1018 -8.431360 156.294705 25.8633
## range_98 range_80
## 1 [2.43024, 2512.90456] [6.7384, 541.41]
## 2 [1, 12.43] [2, 7]
## 3 [0, 0.8] [0, 0.5]
## 4 [-261.019248, 614.10175] [-20.1558, 87.75887]
# Histograms of the numeric columns and frequency tables/plots of the
# categorical columns.
plot_num(tra) # numeric variables
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
freq(tra) # categorical variables
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Ship.Mode frequency percentage cumulative_perc
## 1 Standard Class 204 63.75 63.75
## 2 Second Class 52 16.25 80.00
## 3 First Class 46 14.38 94.38
## 4 Same Day 18 5.62 100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Segment frequency percentage cumulative_perc
## 1 Consumer 161 50.31 50.31
## 2 Corporate 100 31.25 81.56
## 3 Home Office 59 18.44 100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Country frequency percentage cumulative_perc
## 1 United States 320 100 100
## Warning in freq_logic(data = data, input = input[i], plot, na.rm, path_out =
## path_out): Skipping plot for variable 'City' (more than 100 categories)
## City frequency percentage cumulative_perc
## 1 New York City 30 9.38 9.38
## 2 Los Angeles 26 8.12 17.50
## 3 Chicago 17 5.31 22.81
## 4 San Francisco 14 4.38 27.19
## 5 Houston 12 3.75 30.94
## 6 Philadelphia 12 3.75 34.69
## 7 Seattle 12 3.75 38.44
## 8 Columbus 7 2.19 40.63
## 9 San Diego 7 2.19 42.82
## 10 Dallas 6 1.88 44.70
## 11 Columbia 5 1.56 46.26
## 12 Jacksonville 5 1.56 47.82
## 13 Detroit 4 1.25 49.07
## 14 Newark 4 1.25 50.32
## 15 Springfield 4 1.25 51.57
## 16 Atlanta 3 0.94 52.51
## 17 Aurora 3 0.94 53.45
## 18 Austin 3 0.94 54.39
## 19 Concord 3 0.94 55.33
## 20 Huntsville 3 0.94 56.27
## 21 Long Beach 3 0.94 57.21
## 22 Plantation 3 0.94 58.15
## 23 Quincy 3 0.94 59.09
## 24 San Antonio 3 0.94 60.03
## 25 Arlington 2 0.62 60.65
## 26 Brentwood 2 0.62 61.27
## 27 Buffalo 2 0.62 61.89
## 28 Clinton 2 0.62 62.51
## 29 Fairfield 2 0.62 63.13
## 30 Fort Lauderdale 2 0.62 63.75
## 31 Glendale 2 0.62 64.37
## 32 Hialeah 2 0.62 64.99
## 33 Jackson 2 0.62 65.61
## 34 Louisville 2 0.62 66.23
## 35 Marion 2 0.62 66.85
## 36 Milwaukee 2 0.62 67.47
## 37 Nashville 2 0.62 68.09
## 38 Pasadena 2 0.62 68.71
## 39 Pembroke Pines 2 0.62 69.33
## 40 Richmond 2 0.62 69.95
## 41 Saint Charles 2 0.62 70.57
## 42 Skokie 2 0.62 71.19
## 43 Tallahassee 2 0.62 71.81
## 44 Toledo 2 0.62 72.43
## 45 Tucson 2 0.62 73.05
## 46 Washington 2 0.62 73.67
## 47 Appleton 1 0.31 73.98
## 48 Avondale 1 0.31 74.29
## 49 Bakersfield 1 0.31 74.60
## 50 Baltimore 1 0.31 74.91
## 51 Bangor 1 0.31 75.22
## 52 Baytown 1 0.31 75.53
## 53 Belleville 1 0.31 75.84
## 54 Boynton Beach 1 0.31 76.15
## 55 Bristol 1 0.31 76.46
## 56 Carlsbad 1 0.31 76.77
## 57 Carol Stream 1 0.31 77.08
## 58 Carrollton 1 0.31 77.39
## 59 Cary 1 0.31 77.70
## 60 Charlotte 1 0.31 78.01
## 61 Colorado Springs 1 0.31 78.32
## 62 Cuyahoga Falls 1 0.31 78.63
## 63 Dearborn 1 0.31 78.94
## 64 Denver 1 0.31 79.25
## 65 Des Plaines 1 0.31 79.56
## 66 Dublin 1 0.31 79.87
## 67 Encinitas 1 0.31 80.18
## 68 Everett 1 0.31 80.49
## 69 Florence 1 0.31 80.80
## 70 Fort Worth 1 0.31 81.11
## 71 Freeport 1 0.31 81.42
## 72 Georgetown 1 0.31 81.73
## 73 Grand Prairie 1 0.31 82.04
## 74 Great Falls 1 0.31 82.35
## 75 Greensboro 1 0.31 82.66
## 76 Haltom City 1 0.31 82.97
## 77 Henderson 1 0.31 83.28
## 78 Hendersonville 1 0.31 83.59
## 79 Hickory 1 0.31 83.90
## 80 Huntington Beach 1 0.31 84.21
## 81 Indianapolis 1 0.31 84.52
## 82 Johnson City 1 0.31 84.83
## 83 Kent 1 0.31 85.14
## 84 Knoxville 1 0.31 85.45
## 85 Lakeville 1 0.31 85.76
## 86 Lakewood 1 0.31 86.07
## 87 Lawrence 1 0.31 86.38
## 88 Lawton 1 0.31 86.69
## 89 Little Rock 1 0.31 87.00
## 90 Mason 1 0.31 87.31
## 91 Mcallen 1 0.31 87.62
## 92 Meriden 1 0.31 87.93
## 93 Mesquite 1 0.31 88.24
## 94 Missouri City 1 0.31 88.55
## 95 Monroe 1 0.31 88.86
## 96 Moreno Valley 1 0.31 89.17
## 97 Newport News 1 0.31 89.48
## 98 Orange 1 0.31 89.79
## 99 Pasco 1 0.31 90.10
## 100 Passaic 1 0.31 90.41
## 101 Phoenix 1 0.31 90.72
## 102 Pine Bluff 1 0.31 91.03
## 103 Plainfield 1 0.31 91.34
## 104 Pompano Beach 1 0.31 91.65
## 105 Providence 1 0.31 91.96
## 106 Rancho Cucamonga 1 0.31 92.27
## 107 Revere 1 0.31 92.58
## 108 Rochester 1 0.31 92.89
## 109 Rockford 1 0.31 93.20
## 110 Rockville 1 0.31 93.51
## 111 Roseville 1 0.31 93.82
## 112 Sacramento 1 0.31 94.13
## 113 Saint Petersburg 1 0.31 94.44
## 114 Salem 1 0.31 94.75
## 115 San Bernardino 1 0.31 95.06
## 116 Sandy Springs 1 0.31 95.37
## 117 Santa Barbara 1 0.31 95.68
## 118 Sheboygan 1 0.31 95.99
## 119 Smyrna 1 0.31 96.30
## 120 Sunnyvale 1 0.31 96.61
## 121 Thornton 1 0.31 96.92
## 122 Tigard 1 0.31 97.23
## 123 Troy 1 0.31 97.54
## 124 Twin Falls 1 0.31 97.85
## 125 Utica 1 0.31 98.16
## 126 West Palm Beach 1 0.31 98.47
## 127 Westland 1 0.31 98.78
## 128 Wheeling 1 0.31 99.09
## 129 Wilmington 1 0.31 99.40
## 130 Yonkers 1 0.31 100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## State frequency percentage cumulative_perc
## 1 California 63 19.69 19.69
## 2 New York 37 11.56 31.25
## 3 Texas 34 10.62 41.87
## 4 Illinois 25 7.81 49.68
## 5 Florida 17 5.31 54.99
## 6 Ohio 15 4.69 59.68
## 7 Washington 13 4.06 63.74
## 8 Pennsylvania 12 3.75 67.49
## 9 Georgia 8 2.50 69.99
## 10 Michigan 8 2.50 72.49
## 11 North Carolina 8 2.50 74.99
## 12 Virginia 7 2.19 77.18
## 13 Arizona 6 1.88 79.06
## 14 Colorado 5 1.56 80.62
## 15 Kentucky 5 1.56 82.18
## 16 Maryland 5 1.56 83.74
## 17 New Jersey 5 1.56 85.30
## 18 Tennessee 5 1.56 86.86
## 19 Delaware 4 1.25 88.11
## 20 Massachusetts 4 1.25 89.36
## 21 Wisconsin 4 1.25 90.61
## 22 Connecticut 3 0.94 91.55
## 23 Indiana 3 0.94 92.49
## 24 Missouri 3 0.94 93.43
## 25 South Carolina 3 0.94 94.37
## 26 Alabama 2 0.62 94.99
## 27 Arkansas 2 0.62 95.61
## 28 District of Columbia 2 0.62 96.23
## 29 Minnesota 2 0.62 96.85
## 30 Idaho 1 0.31 97.16
## 31 Louisiana 1 0.31 97.47
## 32 Maine 1 0.31 97.78
## 33 Montana 1 0.31 98.09
## 34 New Hampshire 1 0.31 98.40
## 35 New Mexico 1 0.31 98.71
## 36 Oklahoma 1 0.31 99.02
## 37 Oregon 1 0.31 99.33
## 38 Rhode Island 1 0.31 99.64
## 39 West Virginia 1 0.31 100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Region frequency percentage cumulative_perc
## 1 West 91 28.44 28.44
## 2 East 91 28.44 56.88
## 3 Central 80 25.00 81.88
## 4 South 58 18.12 100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Category frequency percentage cumulative_perc
## 1 Office Supplies 189 59.06 59.06
## 2 Furniture 67 20.94 80.00
## 3 Technology 64 20.00 100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Sub.Category frequency percentage cumulative_perc
## 1 Binders 49 15.31 15.31
## 2 Paper 40 12.50 27.81
## 3 Phones 30 9.38 37.19
## 4 Accessories 29 9.06 46.25
## 5 Art 28 8.75 55.00
## 6 Chairs 27 8.44 63.44
## 7 Furnishings 26 8.12 71.56
## 8 Storage 26 8.12 79.68
## 9 Labels 16 5.00 84.68
## 10 Appliances 13 4.06 88.74
## 11 Tables 11 3.44 92.18
## 12 Fasteners 7 2.19 94.37
## 13 Envelopes 5 1.56 95.93
## 14 Machines 5 1.56 97.49
## 15 Supplies 5 1.56 99.05
## 16 Bookcases 3 0.94 100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Indirim_kat frequency percentage cumulative_perc
## 1 indirimli 173 54.06 54.06
## 2 indirim yok 147 45.94 100.00
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Karlilik frequency percentage cumulative_perc
## 1 kar 250 78.12 78.12
## 2 zarar 70 21.88 100.00
## [1] "Variables processed: Ship.Mode, Segment, Country, City, State, Region, Category, Sub.Category, Indirim_kat, Karlilik"
Değişkenlerin hiçbiri normal dağılmamıştır. İlerleyen bölümlerde dönüşüm uygulamamız gerekmektedir.
Ship Mode değişkeninde en çok kullanılan yöntem standart Class olurken en az kullanılan yöntem same day yöntemidir.
Segment içerisinde de en çok alanı consumer segmenti kapsamaktadır.
Tek ülke united states
En çok California eyaletinde işlem yapılmıştır.
Region değişkeninde ise west ve east eşittir.
Category değişkeninde en çok Office Supplies işlem görmüştür.
Sub-Categoryde ise Binders
Genel olarak indirim yapıldığını görüyoruz.
Zarardan çok kar edildiğini gözlemliyoruz.
Kategorik degiskenin duzeyleri bazında, nicel degıskenlerın ozet istatistiklerii
library(psych)
## Warning: package 'psych' was built under R version 4.1.3
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(dplyr)
# Descriptive statistics of Profit split by discount category.
df <- dplyr:: select(tra, Profit, Indirim_kat)
describeBy(df, (df$Indirim_kat))
##
## Descriptive statistics by group
## group: indirim yok
## vars n mean sd median trimmed mad min max range skew
## Profit 1 147 53.36 119.48 13.79 24.85 15.75 0.41 648.56 648.16 3.81
## Indirim_kat* 2 147 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.00 NaN
## kurtosis se
## Profit 14.35 9.85
## Indirim_kat* NaN 0.00
## ------------------------------------------------------------
## group: indirimli
## vars n mean sd median trimmed mad min max
## Profit 1 173 -7.27 324.66 2.64 2.77 15.83 -3701.89 1906.48
## Indirim_kat* 2 173 2.00 0.00 2.00 2.00 0.00 2.00 2.00
## range skew kurtosis se
## Profit 5608.38 -7.33 100.95 24.68
## Indirim_kat* 0.00 NaN NaN 0.00
# Profit distribution by each categorical variable; the black line connects
# the group medians. (library(ggplot2) is re-loaded before each plot —
# redundant but harmless.)
library(ggplot2)
ggplot(tra, aes(x=Category,y=Profit, fill=Category))+
geom_boxplot()+
stat_summary(fun = median, geom="line", group= 1, color= "black", size = 1)
library(ggplot2)
ggplot(tra, aes(x=`Sub-Category`,y=Profit, fill=`Sub-Category`))+
geom_boxplot()+
stat_summary(fun = median, geom="line", group= 1, color= "black", size = 1)
library(ggplot2)
ggplot(tra, aes(x=Region,y=Profit, fill=Region))+
geom_boxplot()+
stat_summary(fun = median, geom="line", group= 1, color= "black", size = 1)
library(ggplot2)
ggplot(tra, aes(x=Segment,y=Profit, fill=Segment))+
geom_boxplot()+
stat_summary(fun = median, geom="line", group= 1, color= "black", size = 1)
library(ggplot2)
ggplot(tra, aes(x=`Ship Mode`,y=Profit, fill=`Ship Mode`))+
geom_boxplot()+
stat_summary(fun = median, geom="line", group= 1, color= "black", size = 1)
Kutu grafikleri sağlıklı sonuçlar vermedi. Verilerin kendi içindeki veya birbirleriyle olan ilişkisindeki çarpıklık gibi bir problemden dolayı olabilir.
library(aplpack)
library(dplyr)
# Chernoff faces of the per-Region means of Profit, Sales and Discount.
new_data<-tra%>%
group_by(Region) %>%
dplyr::summarize(mean_profit = mean(Profit),mean_saless = mean(Sales),mean_discountt = mean(Discount))
faces(new_data[,-1], labels=as.character(new_data$Region))
## effect of variables:
## modified item Var
## "height of face " "mean_profit"
## "width of face " "mean_saless"
## "structure of face" "mean_discountt"
## "height of mouth " "mean_profit"
## "width of mouth " "mean_saless"
## "smiling " "mean_discountt"
## "height of eyes " "mean_profit"
## "width of eyes " "mean_saless"
## "height of hair " "mean_discountt"
## "width of hair " "mean_profit"
## "style of hair " "mean_saless"
## "height of nose " "mean_discountt"
## "width of nose " "mean_profit"
## "width of ear " "mean_saless"
## "height of ear " "mean_discountt"
En çok kar Batıda gerçekleşmiştir.
# Sort training rows by Profit, descending.
data_sorted <- tra[order(-tra$Profit),]
Veriyi dilimleme
library(ggplot2)
# Slice the rows into 10 equal-sized groups by numeric row name.
# NOTE(review): after sorting, row names are the original 1..320 positions,
# so cut_number groups by original row position, not by profit rank —
# confirm this is the intended slicing.
data_sorted$group <- as.numeric(cut_number(as.numeric(rownames(data_sorted)), 10))
library(dplyr)
# Per-group means of Sales, Quantity and Profit, shown as a star plot.
data_star<-data_sorted %>%
group_by(group) %>%
dplyr::summarize(Satis= mean(Sales), Miktar= mean(Quantity),Fev= mean(Profit))
stars(data_star[,-1], key.loc = c(15,1.25),main = "Starplot",label=row.names(data_star),cex=.7)
Kümeleme yapmak istersek 10 ve 8 birbirine çok yakın. 2 ve 9 birbirine çok yakın.
3 Nokta Özeti
# Manual median check: average the two middle order statistics of Sales,
# then compare with median() and mean().
# NOTE(review): a and b are valid row indices only because n = 320 is even;
# for odd n, n/2 is not an integer.
n<-nrow(tra)
train_sorted <- tra[order(tra$Sales),]
a<-(n/2)
b<-(n/2)+1
(train_sorted$Sales[a]+train_sorted$Sales[b])/2
## [1] 50.18
median(tra$Sales)
## [1] 50.18
mean(tra$Sales)
## [1] 228.1496
hist(tra$Sales)
Sales değişkeninde çarpıklık bulunmakta
5 Nokta özeti
# Five-number summary of Sales, coefficient of variation, and a manual
# median-absolute-deviation computation.
# NOTE(review): the names `mean`, `sort` and `mad` shadow base/stats
# functions of the same name — consider renaming to avoid confusion.
fivenum(tra$Sales)
## [1] 1.504 13.870 50.180 195.281 5083.960
stdev<-sd(tra$Sales)
mean<-mean(tra$Sales)
# Coefficient of variation as a percentage.
Degisim_kats_sales<-(stdev/mean)*100
sort <- tra[order(tra$Sales),]
medianf<-median(tra$Sales)
# Absolute deviation of each Sales value from the median; its median is the MAD.
sort$fmed<-abs(sort$Sales-medianf)
sort2 <- sort[order(sort$fmed),]
mad<-median(sort2$fmed)
Sol kuyruk
# Left-tail summary: the median and successively halved lower-tail quantiles
# (probs 1/2, 1/4, 1/8, 1/16, 1/32, 1/64). Returns the same named numeric
# vector as the original six separate quantile() calls, via one vectorized
# call (quantile accepts a vector of probs).
sol <- function(x) {
  quantile(x, probs = 1 / 2^(1:6))
}
Sağ kuyruk
# Right-tail summary: the median and successively halved upper-tail quantiles
# (probs 1/2, 3/4, 7/8, 15/16, 31/32, 63/64 = 1 - 1/2^(1:6)). Identical
# result to the original six separate quantile() calls, via one vectorized call.
sag <- function(x) {
  quantile(x, probs = 1 - 1 / 2^(1:6))
}
# Combine the lower- and upper-tail quantiles of Profit and compute the
# mid-point at each tail depth; a symmetric distribution would give roughly
# constant mid-points.
x_a<-sol(tra$Profit)
x_u<-sag(tra$Profit)
x_mrg<-as.data.frame(cbind(x_a,x_u))
rownames(x_mrg)<-c("1/2","1/4","1/8","1/16","1/32","1/64")
colnames(x_mrg)<-c("Alt_Kuyruk","Ust_Kuyruk")
x_mrg$orta_nokta<-(x_mrg$Alt_Kuyruk+x_mrg$Ust_Kuyruk)/2
x_mrg
## Alt_Kuyruk Ust_Kuyruk orta_nokta
## 1/2 6.81540 6.81540 6.81540
## 1/4 1.02850 26.89180 13.96015
## 1/8 -10.04861 76.57651 33.26395
## 1/16 -38.38016 115.60927 38.61456
## 1/32 -84.32235 227.98207 71.82986
## 1/64 -133.18305 544.98190 205.89943
# Histogram of training-set Profit (default colors).
hist(tra$Profit)
Profit değişkeninin de çarpık olduğunu görebiliyoruz.
# 10% trimmed mean of Profit and the number of observations remaining
# after trimming 10% from each end.
p<-0.1
mean(tra$Profit, trim = p)
## [1] 14.51672
# To compute the number of observations kept after trimming:
n<-nrow(tra)
ks<-n-(as.integer(2*p*n))
ks
## [1] 256
Budanmış ortalama değeri 14.51'dir ve bu değer, budama sonrası kalan 256 gözlem üzerinden hesaplanmıştır.
library("psych")
# Geometric mean of Sales (well-defined: all Sales values are positive).
geometric.mean(tra$Sales)
## [1] 56.07413
# Frequency table of the 3-level Category factor.
freq <- as.data.frame(table(tra$Category))
names(freq)[1] <- 'Kategori'
# Gini impurity, generalized to any number of categories.
# Backward compatible: gini(a, b) with two scalar counts still works.
#
# @param a Vector of category counts (or the first count when `b` is given).
# @param b Optional second count (legacy two-category interface).
# @return 1 - sum(p_i^2), the Gini impurity of the count distribution.
gini <- function(a, b = NULL) {
  counts <- if (is.null(b)) a else c(a, b)
  p <- counts / sum(counts)
  1 - sum(p^2)
}
# BUG FIX: Category has 3 levels but the original passed only the first two
# counts and normalized with k = 2. Use all counts and the true level count.
gn <- gini(freq$Freq)
k <- nrow(freq)
gn / ((k - 1) / k)
# Normalized Gini impurity for the Indirim_kat (discount category) variable.
freq <- as.data.frame(table(tra$`Indirim_kat`))
names(freq)[1] <- 'Kategori'
# NOTE(review): this two-count Gini and k = 2 are only valid if Indirim_kat
# has exactly two levels — confirm; other variables in this file (Category,
# Region) have more levels and would need all counts.
gini <- function(a,b) {
a1 <- (a/(a+b))**2
b1 <- (b/(a+b))**2
x<-1-(a1 + b1)
return(x)
}
gn<-gini(freq[1,2],freq[2,2])
k<-2
gn/((k-1)/k)
## [1] 0.9933984
# Frequency table of the 4-level Region factor.
freq <- as.data.frame(table(tra$Region))
names(freq)[1] <- 'Kategori'
# Gini impurity, generalized to any number of categories.
# Backward compatible: gini(a, b) with two scalar counts still works.
#
# @param a Vector of category counts (or the first count when `b` is given).
# @param b Optional second count (legacy two-category interface).
# @return 1 - sum(p_i^2), the Gini impurity of the count distribution.
gini <- function(a, b = NULL) {
  counts <- if (is.null(b)) a else c(a, b)
  p <- counts / sum(counts)
  1 - sum(p^2)
}
# BUG FIX: Region has 4 levels but the original used only the first two
# counts and normalized with k = 2. Use all counts and the true level count.
gn <- gini(freq$Freq)
k <- nrow(freq)
gn / ((k - 1) / k)
# Shannon entropy, generalized to any number of categories.
# Backward compatible: entropy(base, a, b) with two scalar counts still works.
#
# @param base Logarithm base.
# @param a Vector of category counts (or the first count when `b` is given).
# @param b Optional second count (legacy two-category interface).
# @return -sum(p_i * log(p_i, base)).
entropy <- function(base, a, b = NULL) {
  counts <- if (is.null(b)) a else c(a, b)
  p <- counts / sum(counts)
  -sum(p * log(p, base))
}
# BUG FIX: `freq` here holds the 4-level Region counts, but the original used
# only the first two rows and normalized with k = 2. Use all counts and
# normalize by log of the actual number of levels.
ent <- entropy(10, freq$Freq)
k <- nrow(freq)
ent / (log(k, 10))
Entropi değeri oldukça yüksek çıkmıştır. Değişkenlik çoktur.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x psych::%+%() masks ggplot2::%+%()
## x psych::alpha() masks ggplot2::alpha()
## x dplyr::filter() masks mice::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
## x Hmisc::src() masks dplyr::src()
## x Hmisc::summarize() masks dplyr::summarize()
# Scatter of Profit vs Sales with an OLS trend line (no confidence band).
ggplot(tra, aes(Sales,Profit))+
geom_point(size=2,shape=21,stroke=1,color="dodgerblue1", fill="white")+
geom_smooth(method = "lm", col="darkblue",se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
“Sales” değişkeni ile “Profit” değişkeni arasında doğrusal olmama problemi olduğunu söyleyebiliriz. Satışın artmasıyla karın arttığını gözlemleyebiliyoruz. İki değişkenin ilişkisini gösteren grafikte, aykırı değer olabilecek iki gözlem olduğunu görüyoruz. Şu an için bunları çıkarmamayı tercih ediyoruz. İlerleyen süreçlerde model içerisinde problem çıkarırlarsa bu gözlemler hakkında tekrar düşünülebilir.
# Same scatter with row labels to identify potential outliers by index.
ggplot(tra,aes(x=Sales,y=Profit))+
geom_point(size=1)+
geom_text(label=rownames(tra),nudge_x=0.25,nudge_y=0.25, check_overlap=T)+
geom_smooth(method=lm,col="red",se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
library(ggExtra)
## Warning: package 'ggExtra' was built under R version 4.1.3
# Add marginal histograms of Sales and Profit to the labelled scatter.
gr<-ggplot(tra,aes(x=Sales,y=Profit))+
geom_point()+
geom_text(size=3,label=rownames(tra),nudge_x=0.25,
nudge_y=0.25, check_overlap=T)+
geom_smooth(method=lm,col="brown1", se=FALSE)
ggMarginal(gr,type="histogram",fill="darksalmon")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Daha detaylı bilgi sahibi olmak için bar plot grafiği de ekledik. Verilerin normal dağılmadığını gözlemliyoruz.
Yukarıda aykırı değer olabileceği hakkında söz edilen değerlerin 181 ve 187 değerleri olduğunu görüyoruz.
# Labelled scatter of Profit vs Discount with an OLS line.
ggplot(tra,aes(x=Discount,y=Profit))+
geom_point(size=1)+
geom_text(label=rownames(tra),nudge_x=0.25,nudge_y=0.25, check_overlap=T)+
geom_smooth(method=lm,col="red",se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
Discount değişkeni için aykırı değer olabilecek 181 ve 187 değerleri vardır ve dönüşüme ihtiyaç vardır.
# Labelled scatter of Profit vs Quantity with an OLS line.
ggplot(tra,aes(x=Quantity,y=Profit))+
geom_point(size=1)+
geom_text(label=rownames(tra),nudge_x=0.25,nudge_y=0.25, check_overlap=T)+
geom_smooth(method=lm,col="red",se=FALSE)
## `geom_smooth()` using formula 'y ~ x'
Quantity değişkeni için aykırı değer olabilecek yine 181 ve 187 değerleri vardır ve dönüşüme ihtiyaç vardır.
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:Hmisc':
##
## subplot
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive jittered scatter of Profit vs Discount, styled by Region.
d_plot <- ggplot(tra, aes(Discount, Profit, fill=Region, shape=Region)) +
geom_point(position = position_jitter(width= 0.2, height = 0), size = 2)
ggplotly(d_plot)
İndirim sağlamanın genel olarak kara çok etki etmediğini görüyoruz. Yalnızca Batı bölgesinde %20'lik bir indirim sonrası çok büyük bir kara ulaşılmış; Merkez bölgesinde yapılan %80'lik bir indirim sonrası ise çok büyük bir zarara uğranmıştır. Bunlar aykırı değer olabilirler; fakat etkili değer olabileceklerinden verimizde tutmaya devam ediyoruz.
library(ggplot2)
# Bubble plot: point color and size both encode Discount.
ggplot(tra, aes(Sales,Profit, color=Discount, size=Discount))+
geom_point(alpha=0.5, stroke=2)+
scale_size(range = c(1, 8))+
scale_color_gradient(low = "blue", high = "lightpink")
# Hexbin density of Profit vs Discount.
ggplot(tra,aes(x=Discount,y=Profit))+
geom_hex(bins=20, color = "white")+
scale_fill_gradient(low="mistyrose2", high="violetred3")
Aralarında doğrusal olmama problemi olduğunu söyleyebiliriz. Gözlemler en çok indirimin ve karın 0 olduğu yerde bulunmaktadır.
# 2-D kernel density contours of Profit vs Sales, filled by density level.
# `after_stat(level)` replaces the deprecated `..level..` dot-dot notation.
ggplot(tra, aes(x = Sales, y = Profit)) +
  stat_density_2d(aes(fill = after_stat(level)), geom = "polygon") +
  scale_fill_distiller(palette = "Blues")
“Profit” değişkeni ve “Sales” değişkeni arasında değişen varyanslılık problemi olduğunu söyleyebiliriz.
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.3
## corrplot 0.92 loaded
# Correlation plots of the numeric columns.
# NOTE(review): columns 9:12 are selected by position — presumably
# Sales/Quantity/Discount/Profit; confirm the indices if columns change.
corrplot(cor(tra[,9:12]), method = "ellipse")
corrplot.mixed(cor(tra[,9:12]), lower = "number",
upper = "square",tl.col = "black")
Korelasyon haritasına bakarak nicel değişkenlerin birbirleriyle ilişkisinin çok yüksek olmadığını söyleyebiliriz.
Bölgelere göre Kar üzerinden ortanca ve DAG değerlerini bulalım.
library(dplyr)
# Quartiles and interquartile range (DAG) of Profit by Region.
a<-tra %>% group_by(Region) %>%
dplyr:: summarize(Q1=quantile (Profit, probs=0.25), Median=quantile (Profit, probs=0.50), Q3=quantile(Profit, probs=0.75), DAG=Q3-Q1)
a
## # A tibble: 4 x 5
## Region Q1 Median Q3 DAG
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 West 3.53 8.88 26.8 23.3
## 2 Central -19.2 2.93 13.2 32.5
## 3 East 1.66 9.33 36.9 35.2
## 4 South 2.03 7.31 32.6 30.6
Kategorilere göre Kar üzerinden ortanca ve DAG değerlerini bulalım.
library(dplyr)
# Quartiles and interquartile range (DAG) of Profit by Category.
b<-tra %>%group_by(Category) %>%
dplyr:: summarize(Q1=quantile (Profit, probs=0.25), Median=quantile (Profit, probs=0.50), Q3=quantile(Profit, probs=0.75), DAG=Q3-Q1)
b
## # A tibble: 3 x 5
## Category Q1 Median Q3 DAG
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Technology 5.51 29.4 70.9 65.4
## 2 Office Supplies 1.80 5.66 14.8 13.0
## 3 Furniture -25.2 6.22 28.0 53.2
Taşıma şekline göre Kar üzerinden ortanca ve DAG değerlerini bulalım.
library(dplyr)
# Quartiles and IQR of Profit by Ship Mode; .drop = FALSE keeps empty levels.
c<-tra %>%group_by(`Ship Mode` , .drop = FALSE) %>%
dplyr:: summarize(Q1=quantile (Profit , probs=0.25), Median=quantile (Profit, probs=0.50), Q3=quantile(Profit, probs=0.75), DAG=Q3-Q1)
c
## # A tibble: 4 x 5
## `Ship Mode` Q1 Median Q3 DAG
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Second Class 2.80 19.0 47.5 44.7
## 2 Standard Class 0.540 6.22 25.3 24.8
## 3 First Class 1.19 7.14 14.9 13.7
## 4 Same Day -21.9 7.19 20.7 42.6
Alt kategorilere göre Kar üzerinden ortanca ve DAG değerlerini bulalım.
library(dplyr)
# Quartiles and interquartile range (DAG) of Profit by Sub-Category.
d<-tra %>%group_by( `Sub-Category`) %>%
dplyr:: summarize(Q1=quantile (Profit, probs=0.25), Median=quantile (Profit, probs=0.50), Q3=quantile(Profit, probs=0.75), DAG=Q3-Q1)
d
## # A tibble: 16 x 5
## `Sub-Category` Q1 Median Q3 DAG
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 Accessories 16.2 57.6 87.9 71.7
## 2 Appliances -20.3 10.9 26.3 46.7
## 3 Art 1.36 2.63 7.38 6.02
## 4 Binders -4.92 2.34 9.88 14.8
## 5 Bookcases -2.10 8.50 60.4 62.5
## 6 Chairs -25.2 10.3 63.6 88.7
## 7 Envelopes 5.34 7.60 8.40 3.06
## 8 Fasteners 1.96 3.60 5.40 3.45
## 9 Furnishings -4.58 5.53 14.9 19.5
## 10 Labels 3.88 6.12 16.9 13.0
## 11 Machines -16.4 9.00 216. 232.
## 12 Paper 5.26 8.86 25.3 20.0
## 13 Phones 2.02 21.1 35.8 33.8
## 14 Storage 2.73 7.91 22.7 20.0
## 15 Supplies 3.60 11.6 15.4 11.8
## 16 Tables -95.7 -0.889 24.6 120.
# Median traces of Profit across the levels of each categorical variable.
plot(a$Region,a$Median, xlab="Region", ylab="Ortanca", main="Ortanca izi cizimi")
plot(b$Category,b$Median, xlab="Category", ylab="Ortanca", main="Ortanca izi cizimi")
plot(c$`Ship Mode`,c$Median, xlab="Ship Mode", ylab="Ortanca", main="Ortanca izi cizimi")
plot(d$`Sub-Category`,d$Median, xlab="Sub-Category", ylab="Ortanca", main="Ortanca izi cizimi")
# Spread (DAG) versus median by Region: spread varying with level signals
# heteroscedasticity.
ggplot(a, aes(x=Median,y=DAG, color=Region, group=1))+
geom_point(size=4,alpha=0.6)+
geom_line(color="black")
Varyanslar homojen değil dönüşüme ihtiyaç var.
# Spread-vs-median plots for Category, Ship Mode and Sub-Category.
ggplot(b, aes(x=Median,y=DAG, color=Category, group=1))+
geom_point(size=4,alpha=0.6)+
geom_line(color="black")
Varyanslar homojen değil dönüşüme ihtiyaç var.
ggplot(c, aes(x=Median,y=DAG, color=`Ship Mode`, group=1))+
geom_point(size=4,alpha=0.6)+
geom_line(color="black")
Varyanslar homojen değil dönüşüme ihtiyaç var.
ggplot(d, aes(x=Median,y=DAG, color=`Sub-Category`, group=1))+
geom_point(size=4,alpha=0.6)+
geom_line(color="black")
Varyanslar homojen değil dönüşüme ihtiyaç var.
# Median Profit for each Region x Category cell (interaction check).
etk_train<-tra%>%
group_by(Region,Category)%>%
summarise(Median=median(Profit))
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
etk_train
## # A tibble: 12 x 3
## # Groups: Region [4]
## Region Category Median
## <fct> <fct> <dbl>
## 1 West Technology 30.8
## 2 West Office Supplies 6.47
## 3 West Furniture 10.1
## 4 Central Technology 15.5
## 5 Central Office Supplies 3.63
## 6 Central Furniture -26.1
## 7 East Technology 36.9
## 8 East Office Supplies 6.81
## 9 East Furniture 15.5
## 10 South Technology 29.9
## 11 South Office Supplies 6.13
## 12 South Furniture 8.50
# Interaction plot: non-parallel lines suggest a Region x Category interaction.
ggplot(etk_train, aes(x = Category, y = Median,color=Region,group=Region)) +
geom_line() +
geom_point()
# Candidate transformations for Sales: log10 and square root.
tra$sales_log<-log10(tra$Sales)
hist(tra$sales_log)
tra$sales_kok <- sqrt(tra$Sales)
hist(tra$sales_kok)
Sales değişkeni için log dönüşümünün yeterli olduğunu söyleyebiliriz. Log dönüşümünde normale yakınsadı.
# Shift Discount so its minimum maps to 1 before log/sqrt (log10(0) is -Inf).
tra$discount_log<-log10(tra$Discount + 1 - min(tra$Discount))
hist(tra$discount_log)
tra$discount_kok <- sqrt(tra$Discount + 1 - min(tra$Discount))
hist(tra$discount_kok)
Discount değişkeni için log dönüşümünü kullanmayı tercih ediyoruz.
# Same shifted log/sqrt for Profit (which contains negative values).
tra$profit_log<-log10(tra$Profit + 1 - min(tra$Profit))
hist(tra$profit_log)
tra$profit_kok <- sqrt(tra$Profit + 1 - min(tra$Profit))
hist(tra$profit_kok)
# NOTE(review): despite the name "kare" (square), this is the reciprocal
# 1/Profit; a Profit of 0 yields Inf (later removed via DropNA).
tra$profit_kare <-(tra$Profit)^(-1)
hist(tra$profit_kare)
Profit değişkeni için de ters dönüşüm tercih ediyoruz.
# Log and square-root candidates for Quantity (strictly positive counts).
tra$quantity_log<-log10(tra$Quantity)
hist(tra$quantity_log)
tra$quantity_kok <- sqrt(tra$Quantity)
hist(tra$quantity_kok)
Quantity değişkeni için log dönüşümünü uygulamayı tercih ediyoruz.
# LOESS fits of the transformed Profit against each transformed predictor.
ggplot(tra, aes(sales_log,profit_kare,label=rownames(tra)))+
geom_point(size=1)+
geom_text(label=rownames(tra),nudge_x=0.04,check_overlap=T,size=2.5)+
geom_smooth(method = "loess", col="darkblue",se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
ggplot(tra, aes(discount_log,profit_kare,label=rownames(tra)))+
geom_point(size=1)+
geom_text(label=rownames(tra),nudge_x=0.04,check_overlap=T,size=2.5)+
geom_smooth(method = "loess", col="darkblue",se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
ggplot(tra, aes(quantity_log,profit_kare , label=rownames(tra)))+
geom_point(size=1)+
geom_text(label=rownames(tra),nudge_x=0.04,check_overlap=T,size=2.5)+
geom_smooth(method = "loess", col="darkblue",se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
# Robust outlier bounds for quantity_log: median +/- 3 * MAD (constant = 1).
lower_bound_quantity_log <- median(tra$quantity_log) - 3 * mad(tra$quantity_log, constant = 1)
lower_bound_quantity_log
## [1] -0.05115252
upper_bound_quantity_log <- median(tra$quantity_log) + 3 * mad(tra$quantity_log, constant = 1)
upper_bound_quantity_log
## [1] 1.005395
# BUG FIX: the original filtered against lower_bound_discount /
# upper_bound_discount (bounds of a different variable), so the reported
# outlier indices were wrong. Compare against the quantity_log bounds.
outlier_ind_quantity_log <- which(tra$quantity_log < lower_bound_quantity_log | tra$quantity_log > upper_bound_quantity_log)
outlier_ind_quantity_log
# Robust outlier bounds for sales_log: median +/- 3 * MAD (constant = 1).
lower_bound_sales_log <- median(tra$sales_log) - 3 * mad(tra$sales_log, constant = 1)
lower_bound_sales_log
## [1] -0.007087954
upper_bound_sales_log <- median(tra$sales_log) + 3 * mad(tra$sales_log, constant = 1)
upper_bound_sales_log
## [1] 3.407871
outlier_ind_sales_log <- which(tra$sales_log < lower_bound_sales_log | tra$sales_log > upper_bound_sales_log)
outlier_ind_sales_log
## [1] 63 129 164 181
## [1] 63 129 164 181
# Robust outlier bounds for discount_log: median +/- 3 * MAD (constant = 1).
lower_bound_discount_log <- median(tra$discount_log) - 3 * mad(tra$discount_log, constant = 1)
lower_bound_discount_log
## [1] -0.1583625
upper_bound_discount_log <- median(tra$discount_log) + 3 * mad(tra$discount_log, constant = 1)
upper_bound_discount_log
## [1] 0.316725
outlier_ind_discount_log <- which(tra$discount_log < lower_bound_discount_log | tra$discount_log > upper_bound_discount_log)
outlier_ind_discount_log
## integer(0)
# Robust outlier bounds for profit_kare (the reciprocal Profit transform).
lower_bound_profit_kare <- median(tra$profit_kare) - 3 * mad(tra$profit_kare, constant = 1)
lower_bound_profit_kare
## [1] -0.1548746
upper_bound_profit_kare <- median(tra$profit_kare) + 3 * mad(tra$profit_kare, constant = 1)
upper_bound_profit_kare
## [1] 0.2412646
outlier_ind_profit_kare <- which(tra$profit_kare < lower_bound_profit_kare | tra$profit_kare > upper_bound_profit_kare)
outlier_ind_profit_kare
## [1] 6 7 9 11 15 17 18 26 33 45 50 52 54 66 67 74 77 80 83
## [20] 85 87 96 98 101 103 107 111 112 121 128 131 134 138 142 143 150 151 152
## [39] 153 154 157 161 163 175 178 185 191 193 202 204 207 209 211 214 216 221 223
## [58] 224 238 245 251 256 260 272 274 279 281 283 285 286 296 304 306 309
Dönüşümlerin aykırı/uç değer sayısını önemli ölçüde azalttığını görüyoruz. Etkili değer olabilecekleri için bu gözlemleri çıkarmamayı tercih ediyoruz.
Sales değişkenimizi merkezileştiriyoruz.
# Center the Sales variable at its mean.
mean_sales <- mean(tra$Sales)
# BUG FIX: the original subtracted mean(Sales) from tra$Profit; centering
# Sales means Sales - mean(Sales).
tra$sales_merkez <- tra$Sales - mean_sales
# Compare linear (green), quadratic (blue) and cubic (red) fits of
# profit_kare on the centered Sales.
ggplot(tra, aes(x = sales_merkez, y = profit_kare)) +
  stat_smooth(method = "lm", se = FALSE, color = "green", formula = y ~ x) +
  stat_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x + I(x ^ 2)) +
  stat_smooth(method = "lm", se = FALSE, color = "red", formula = y ~ x + I(x ^ 2) + I(x ^ 3)) +
  geom_point(colour = "black", size = 1)
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Removed 1 rows containing non-finite values (stat_smooth).
## Removed 1 rows containing non-finite values (stat_smooth).
Kırmızı çizgimiz noktalarla en iyi uyuşan çizgidir. Kübik dönüşüm gerekmektedir.
loglu sales’i merkezleştirip karesel terimlere bakıyoruz
# Center the log-transformed Sales, then compare polynomial fits of degree
# 1 (green), 2 (blue) and 3 (red).
mean_saleslog<-mean(tra$sales_log)
tra$sales_log_merkez<-(tra$sales_log-mean_saleslog)
ggplot(tra, aes(x = sales_log_merkez, y =profit_kare )) +
stat_smooth(method = "lm", se = FALSE, color = "green", formula = y ~ x) +
stat_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x + I(x ^ 2)) +
stat_smooth(method = "lm", se = FALSE, color = "red", formula = y ~ x + I(x ^ 2)+ I(x ^ 3)) +
geom_point(colour = "black", size = 1)
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Removed 1 rows containing non-finite values (stat_smooth).
## Removed 1 rows containing non-finite values (stat_smooth).
# NOTE(review): this second plot repeats the one above with different colors
# only — likely redundant.
ggplot(tra, aes(x = sales_log_merkez, y =profit_kare )) +
stat_smooth(method = "lm", se = FALSE, color = "magenta", formula = y ~ x) +
stat_smooth(method = "lm", se = FALSE, color = "green", formula = y ~ x + I(x ^ 2)) +
stat_smooth(method = "lm", se = FALSE, color = "cyan", formula = y ~ x + I(x ^ 2)+ I(x ^ 3)) +
geom_point(colour = "black", size = 1)
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Removed 1 rows containing non-finite values (stat_smooth).
## Removed 1 rows containing non-finite values (stat_smooth).
Tukey’s Ladder
library(rcompanion)
## Warning: package 'rcompanion' was built under R version 4.1.3
## Registered S3 method overwritten by 'DescTools':
## method from
## plot.bagplot aplpack
##
## Attaching package: 'rcompanion'
## The following object is masked from 'package:psych':
##
## phi
# Tukey's ladder of powers: picks the lambda maximizing Shapiro-Wilk W.
profit_tukey<-transformTukey(tra$Profit + 1 - min(tra$Profit),plotit=FALSE)
##
## lambda W Shapiro.p.value
## 482 2.025 0.3295 1.575e-32
##
## if (lambda > 0){TRANS = x ^ lambda}
## if (lambda == 0){TRANS = log(x)}
## if (lambda < 0){TRANS = -1 * x ^ lambda}
# NOTE(review): this OVERWRITES profit_tukey with the Sales transform —
# probably meant to be `sales_tukey`; confirm before relying on profit_tukey
# anywhere later.
profit_tukey<- transformTukey(tra$Sales, plotit=FALSE)
##
## lambda W Shapiro.p.value
## 398 -0.075 0.9881 0.01017
##
## if (lambda > 0){TRANS = x ^ lambda}
## if (lambda == 0){TRANS = log(x)}
## if (lambda < 0){TRANS = -1 * x ^ lambda}
Profit için
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
# Box-Cox for shifted Profit: pick lambda with max profile log-likelihood.
Box_profit<- boxcox(tra$Profit + 1 - min(tra$Profit) ~ 1,
lambda = seq(-6,6,0.1))
Cox_profit<- data.frame(Box_profit$x, Box_profit$y)
# Sort by log-likelihood (descending); the top row is the best lambda.
Cox_profit <- Cox_profit[order(-Cox_profit$Box_profit.y),]
Cox_profit[1,]
## Box_profit.x Box_profit.y
## 81 2 2.6096
lambda <- Cox_profit[1, "Box_profit.x"]
lambda
## [1] 2
Sales için
library(MASS)
# Box-Cox for Sales: pick the lambda with maximum profile log-likelihood.
Box_sales<- boxcox(tra$Sales ~ 1,
lambda = seq(-6,6,0.1))
Cox_sales<- data.frame(Box_sales$x, Box_sales$y)
Cox_sales <- Cox_sales[order(-Cox_sales$Box_sales.y),]
Cox_sales[1,]
## Box_sales.x Box_sales.y
## 60 -0.1 -1088.712
# NOTE(review): `lambda` is reused — it held 2 (Profit) and now -0.1 (Sales).
lambda <- Cox_sales[1, "Box_sales.x"]
lambda
## [1] -0.1
Ham hali üzerinden saçılım matrisi
Çarpıklık gözlemi
# Raw numeric columns for the scatterplot matrix. NOTE(review): positional
# indices 9:12 — presumably Sales/Quantity/Discount/Profit; confirm.
orj<-tra[,c(9,10,11,12)]
library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 4.1.3
## Zorunlu paket yükleniyor: xts
## Warning: package 'xts' was built under R version 4.1.3
## Zorunlu paket yükleniyor: zoo
## Warning: package 'zoo' was built under R version 4.1.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
# Scatterplot matrices: raw variables vs the transformed set. NOTE(review):
# positional columns 15, 21, 17, 19 — confirm they map to the intended
# transformed variables if columns are added or reordered.
chart.Correlation(orj, histogram=TRUE, pch=19)
transform_train<-tra[,c(15,21,17,19)]
chart.Correlation(transform_train, histogram=TRUE, pch=19)
# Association statistics (Birliktelik Istatistikleri)
dt1<-table(tra$Category,tra$Region)
# Column-wise proportions: distribution of Category within each Region.
prop.table(dt1,2)
##
## West Central East South
## Technology 0.1978022 0.1500000 0.2197802 0.2413793
## Office Supplies 0.5824176 0.6125000 0.5494505 0.6379310
## Furniture 0.2197802 0.2375000 0.2307692 0.1206897
round(100*prop.table(dt1,2), 2)
##
## West Central East South
## Technology 19.78 15.00 21.98 24.14
## Office Supplies 58.24 61.25 54.95 63.79
## Furniture 21.98 23.75 23.08 12.07
# Same proportions with column sums appended.
addmargins(round(prop.table(dt1,2), 2),1)
##
## West Central East South
## Technology 0.20 0.15 0.22 0.24
## Office Supplies 0.58 0.61 0.55 0.64
## Furniture 0.22 0.24 0.23 0.12
## Sum 1.00 1.00 1.00 1.00
# BUG FIX: the original lines here were garbled — `prop.table(data.matrix(
# rowsum(…)), 1)` contains a literal ellipsis and is not valid R, and
# `rowsum(2, 1)` is meaningless (it just returned 1). The intended analysis
# appears to be ROW-wise proportions of the Category x Region table,
# complementing the column-wise proportions above.
dt1 <- table(tra$Category, tra$Region)
round(100 * prop.table(dt1, 1), 2)
round(100 * prop.table(dt1, 2), 2)
##
## West Central East South
## Technology 19.78 15.00 21.98 24.14
## Office Supplies 58.24 61.25 54.95 63.79
## Furniture 21.98 23.75 23.08 12.07
library("gplots")
## Warning: package 'gplots' was built under R version 4.1.3
## Registered S3 method overwritten by 'gplots':
## method from
## reorder.factor DescTools
##
## Attaching package: 'gplots'
## The following object is masked from 'package:PerformanceAnalytics':
##
## textplot
## The following object is masked from 'package:stats':
##
## lowess
# Balloon plot of the Category x Region contingency table.
balloonplot(t(dt1), main ="Category ve Region ", xlab ="", ylab="",
label = FALSE,show.margins = FALSE)
Office Supplies kategorisi her bölgede çok daha fazladır.
# Sub-Category distribution within each Region (column proportions).
dt2<-table(tra$`Sub-Category`,tra$Region)
prop.table(dt2,2)
##
## West Central East South
## Accessories 0.08791209 0.07500000 0.08791209 0.12068966
## Appliances 0.02197802 0.06250000 0.02197802 0.06896552
## Art 0.13186813 0.06250000 0.07692308 0.06896552
## Binders 0.09890110 0.15000000 0.20879121 0.15517241
## Bookcases 0.01098901 0.00000000 0.01098901 0.01724138
## Chairs 0.12087912 0.10000000 0.06593407 0.03448276
## Copiers 0.00000000 0.00000000 0.00000000 0.00000000
## Envelopes 0.00000000 0.00000000 0.04395604 0.01724138
## Fasteners 0.04395604 0.01250000 0.00000000 0.03448276
## Furnishings 0.03296703 0.10000000 0.13186813 0.05172414
## Labels 0.07692308 0.03750000 0.02197802 0.06896552
## Machines 0.01098901 0.01250000 0.01098901 0.03448276
## Paper 0.12087912 0.17500000 0.08791209 0.12068966
## Phones 0.09890110 0.06250000 0.12087912 0.08620690
## Storage 0.06593407 0.10000000 0.07692308 0.08620690
## Supplies 0.02197802 0.01250000 0.01098901 0.01724138
## Tables 0.05494505 0.03750000 0.02197802 0.01724138
# Same column proportions expressed as percentages.
round(100*prop.table(dt2,2), 2)
##
## West Central East South
## Accessories 8.79 7.50 8.79 12.07
## Appliances 2.20 6.25 2.20 6.90
## Art 13.19 6.25 7.69 6.90
## Binders 9.89 15.00 20.88 15.52
## Bookcases 1.10 0.00 1.10 1.72
## Chairs 12.09 10.00 6.59 3.45
## Copiers 0.00 0.00 0.00 0.00
## Envelopes 0.00 0.00 4.40 1.72
## Fasteners 4.40 1.25 0.00 3.45
## Furnishings 3.30 10.00 13.19 5.17
## Labels 7.69 3.75 2.20 6.90
## Machines 1.10 1.25 1.10 3.45
## Paper 12.09 17.50 8.79 12.07
## Phones 9.89 6.25 12.09 8.62
## Storage 6.59 10.00 7.69 8.62
## Supplies 2.20 1.25 1.10 1.72
## Tables 5.49 3.75 2.20 1.72
# Rounded proportions with column sums (rounding makes the sums drift from 1).
addmargins(round(prop.table(dt2,2), 2),1)
##
## West Central East South
## Accessories 0.09 0.07 0.09 0.12
## Appliances 0.02 0.06 0.02 0.07
## Art 0.13 0.06 0.08 0.07
## Binders 0.10 0.15 0.21 0.16
## Bookcases 0.01 0.00 0.01 0.02
## Chairs 0.12 0.10 0.07 0.03
## Copiers 0.00 0.00 0.00 0.00
## Envelopes 0.00 0.00 0.04 0.02
## Fasteners 0.04 0.01 0.00 0.03
## Furnishings 0.03 0.10 0.13 0.05
## Labels 0.08 0.04 0.02 0.07
## Machines 0.01 0.01 0.01 0.03
## Paper 0.12 0.17 0.09 0.12
## Phones 0.10 0.06 0.12 0.09
## Storage 0.07 0.10 0.08 0.09
## Supplies 0.02 0.01 0.01 0.02
## Tables 0.05 0.04 0.02 0.02
## Sum 0.99 0.98 1.00 1.01
# Segment distribution within each Region (column proportions).
dt3<-table(tra$Segment,tra$Region)
prop.table(dt3,2)
##
## West Central East South
## Consumer 0.4505495 0.5000000 0.5824176 0.4655172
## Home Office 0.1868132 0.1875000 0.1428571 0.2413793
## Corporate 0.3626374 0.3125000 0.2747253 0.2931034
# Same column proportions expressed as percentages.
round(100*prop.table(dt3,2), 2)
##
## West Central East South
## Consumer 45.05 50.00 58.24 46.55
## Home Office 18.68 18.75 14.29 24.14
## Corporate 36.26 31.25 27.47 29.31
# Rounded proportions with column sums appended.
addmargins(round(prop.table(dt3,2), 2),1)
##
## West Central East South
## Consumer 0.45 0.50 0.58 0.47
## Home Office 0.19 0.19 0.14 0.24
## Corporate 0.36 0.31 0.27 0.29
## Sum 1.00 1.00 0.99 1.00
library("gplots")
# Balloon plot of the Segment x Region contingency table.
balloonplot(t(dt3), main ="Segment ve Region ", xlab ="", ylab="",
label = FALSE,show.margins = FALSE)
Consumer segmenti her bölgede daha baskın gelmektedir.
# NOTE(review): this cross-tabulates Region with ITSELF, giving a diagonal
# table — a chi-square test on it is degenerate. A second variable (e.g.
# Category) was probably intended; confirm.
dt_c<-table(tra$Region,tra$Region)
dtc_exp <- chisq.test(dt_c)$expected
# Chi-square contribution of a single table row: sum over the columns of
# (observed - expected)^2 / expected.
#
# @param i Row index.
# @param obs Observed-count matrix/table.
# @param exp Expected-count matrix of the same shape.
# @return Scalar chi-square contribution of row i.
rowcs <- function(i, obs, exp) {
  observed <- obs[i, ]
  expected <- exp[i, ]
  sum((observed - expected)^2 / expected)
}
# Per-row chi-square contributions; row names carried over from the table.
chi_dtc<-as.matrix(lapply(seq_len(nrow(dt_c)), rowcs, obs = dt_c, exp = dtc_exp))
rownames(chi_dtc)<-rownames(dt_c)
chi_dtc
## [,1]
## West 229
## Central 240
## East 229
## South 262
library(inspectdf)
## Warning: package 'inspectdf' was built under R version 4.1.3
library(dplyr)
# Overview of column types in the training data.
tra %>% inspect_types()
## # A tibble: 2 x 4
## type cnt pcnt col_name
## <chr> <int> <dbl> <named list>
## 1 numeric 15 60 <chr [15]>
## 2 factor 10 40 <chr [10]>
# Level frequencies for every categorical column; Region shown as an example.
tra_cat<-tra %>% inspect_cat()
tra_cat$levels$Region
## # A tibble: 4 x 3
## value prop cnt
## <chr> <dbl> <int>
## 1 East 0.284 91
## 2 West 0.284 91
## 3 Central 0.25 80
## 4 South 0.181 58
tra_cat %>% show_plot()
library(readxl)
# Load the hold-out test set and apply the same factor codings used for the
# training data. (Duplicate Country and Region factor lines removed — the
# original repeated each assignment twice, to no effect.)
test <- read_excel("C:/Users/SİMAY/Desktop/Lessons/Veri Analizi/verianalizi_proje/test.xlsx")
View(test)
test$`Ship Mode` <- factor(test$`Ship Mode`, levels=c("Second Class","Standard Class","First Class","Same Day"))
test$Segment <- factor(test$Segment, levels=c ("Consumer","Home Office","Corporate"))
test$Country <- factor(test$Country, levels=c ("United States"))
test$Region <- factor(test$Region, levels=c("West","Central","East","South"))
test$Category <- factor(test$Category, levels=c("Technology","Office Supplies","Furniture"))
test$`Sub-Category` <- as.factor(test$`Sub-Category`)
test$City <- as.factor(test$City)
test$State <- as.factor(test$State)
# NOTE(review): these overwrite the loose global vectors created earlier from
# the training data; they do not appear to be used afterwards — confirm.
Quantity <- as.numeric(test$Quantity)
Discount <- as.numeric(test$Discount)
Profit <- as.numeric(test$Profit)
Sales <- as.numeric(test$Sales)
summary(test)
## Ship Mode Segment Country City
## Second Class :16 Consumer :46 United States:80 New York City: 5
## Standard Class:43 Home Office:12 Seattle : 5
## First Class :17 Corporate :22 Houston : 4
## Same Day : 4 Chicago : 3
## Los Angeles : 3
## Philadelphia : 3
## (Other) :57
## State Region Category Sub-Category
## California:18 West :29 Technology :11 Paper :15
## New York : 8 Central:17 Office Supplies:51 Binders :13
## Texas : 8 East :23 Furniture :18 Appliances : 9
## Illinois : 5 South :11 Furnishings: 9
## Ohio : 5 Chairs : 7
## Washington: 5 Phones : 5
## (Other) :31 (Other) :22
## Sales Quantity Discount Profit
## Min. : 2.264 Min. : 1.000 Min. :0.0000 Min. :-559.356
## 1st Qu.: 18.549 1st Qu.: 2.000 1st Qu.:0.0000 1st Qu.: 3.015
## Median : 44.046 Median : 3.000 Median :0.2000 Median : 10.326
## Mean : 408.921 Mean : 4.088 Mean :0.1663 Mean : 109.089
## 3rd Qu.: 278.036 3rd Qu.: 5.000 3rd Qu.:0.2000 3rd Qu.: 36.824
## Max. :13999.960 Max. :11.000 Max. :0.8000 Max. :6719.981
##
# (Removed: a second, verbatim-duplicate `summary(test)` call whose output
# repeated the block immediately above.)
Sales değişkeni için logaritmik dönüşüm
# Apply the transformations chosen on the training set to the test set.
test$sales_log<-log10(test$Sales)
Discount değişkeni için logaritmik dönüşüm
test$discount_log<-log10(test$Discount + 1 - min(test$Discount))
profit değişkeni için karesel
# NOTE(review): reciprocal transform (mirrors tra$profit_kare's misnomer).
test$profit_kare<-(test$Profit)^(-1)
quantity değişkeni için log dönüşümü
test$quantity_log<-log10(test$Quantity)
# NOTE(review): centered with the TEST-set mean; for consistent scoring the
# training mean (mean_saleslog) would normally be used — confirm.
test$sales_log_merkez<-(test$sales_log-mean(test$sales_log))
# Baseline linear model: Profit on raw Sales plus Region and Category.
fit1<-lm(Profit ~ Sales+Region+Category, data=tra)
summary(fit1)
##
## Call:
## lm(formula = Profit ~ Sales + Region + Category, data = tra)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3928.2 -12.8 -0.5 33.8 1221.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.67877 39.46675 0.777 0.438
## Sales 0.13181 0.02722 4.843 2.02e-06 ***
## RegionCentral -76.12399 37.46741 -2.032 0.043 *
## RegionEast -3.76213 36.29057 -0.104 0.918
## RegionSouth -23.09435 41.23675 -0.560 0.576
## CategoryOffice Supplies -15.26947 36.12577 -0.423 0.673
## CategoryFurniture -32.78026 42.94763 -0.763 0.446
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 244.2 on 313 degrees of freedom
## Multiple R-squared: 0.09003, Adjusted R-squared: 0.07258
## F-statistic: 5.161 on 6 and 313 DF, p-value: 4.452e-05
# Reduced model: Category dropped (its coefficients were not significant in
# the fit1 summary above).
fit1_1<-lm(Profit ~ Sales+Region, data=tra)
summary(fit1_1)
##
## Call:
## lm(formula = Profit ~ Sales + Region, data = tra)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3927.2 -13.4 -2.8 40.0 1219.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.46371 26.58628 0.544 0.5868
## Sales 0.13223 0.02644 5.000 9.52e-07 ***
## RegionCentral -77.14647 37.35287 -2.065 0.0397 *
## RegionEast -3.58490 36.17903 -0.099 0.9211
## RegionSouth -20.65248 41.01398 -0.504 0.6149
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 243.6 on 315 degrees of freedom
## Multiple R-squared: 0.08832, Adjusted R-squared: 0.07675
## F-statistic: 7.629 on 4 and 315 DF, p-value: 7.053e-06
##Tahmin
# Score the hold-out test set with the reduced model.
predictions <- predict(fit1_1, test) # on the test set
library(caret)
## Warning: package 'caret' was built under R version 4.1.3
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## The following object is masked from 'package:survival':
##
## cluster
# Training-set performance: RMSE / R-squared / MAE.
round(defaultSummary(data.frame(obs=tra$Profit,pred=predict(fit1_1,tra))),3)
## RMSE Rsquared MAE
## 241.701 0.088 57.465
#merkezilestirilmis uzerinden
library(DataCombine)
## Warning: package 'DataCombine' was built under R version 4.1.3
tra[is.na(tra) | tra == "Inf"] = NA
dn <- DropNA(tra)
## No Var specified. Dropping all NAs from the data frame.
## 1 rows dropped from the data frame because of missing values.
# Test-set performance (RMSE / R-squared / MAE) for fit1_1:
test_preds <- predict(fit1_1, test)
round(defaultSummary(data.frame(obs = test$Profit, pred = test_preds)), 2)
## RMSE Rsquared MAE
## 552.27 0.89 108.11
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.1.3
# Standard lm diagnostic panels (residuals, QQ, leverage) for fit1_1.
ggplot2::autoplot(fit1_1)
## Modeling - polynomial
# Cubic polynomial in the centered log-sales, plus Region and Category.
fit2 <- lm(
  profit_kare ~ sales_log_merkez + I(sales_log_merkez^2) +
    I(sales_log_merkez^3) + Region + Category,
  data = tra
)
summary(fit2)
##
## Call:
## lm(formula = profit_kare ~ sales_log_merkez + I(sales_log_merkez^2) +
## I(sales_log_merkez^3) + Region + Category, data = tra)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6644 -0.0714 0.0020 0.0766 4.5973
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.037640 0.071886 0.524 0.6009
## sales_log_merkez 0.051179 0.070785 0.723 0.4702
## I(sales_log_merkez^2) 0.244900 0.043044 5.689 2.95e-08 ***
## I(sales_log_merkez^3) -0.190157 0.044085 -4.313 2.16e-05 ***
## RegionCentral -0.158039 0.066153 -2.389 0.0175 *
## RegionEast -0.081299 0.064598 -1.259 0.2091
## RegionSouth -0.115977 0.073213 -1.584 0.1142
## CategoryOffice Supplies 0.094066 0.072365 1.300 0.1946
## CategoryFurniture 0.008236 0.075943 0.108 0.9137
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4299 on 310 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1914, Adjusted R-squared: 0.1705
## F-statistic: 9.169 on 8 and 310 DF, p-value: 2.475e-11
# Simpler alternative: linear in log-sales with Region only.
fit2 <- lm(profit_kare ~ sales_log + Region, data = tra)
summary(fit2)
##
## Call:
## lm(formula = profit_kare ~ sales_log + Region, data = tra)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3737 -0.1003 -0.0238 0.0725 5.4821
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.50052 0.07808 6.410 5.33e-10 ***
## sales_log -0.17862 0.03451 -5.176 4.05e-07 ***
## RegionCentral -0.15938 0.06958 -2.291 0.0226 *
## RegionEast -0.04076 0.06732 -0.605 0.5453
## RegionSouth -0.07085 0.07623 -0.929 0.3534
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4526 on 314 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.09206, Adjusted R-squared: 0.0805
## F-statistic: 7.96 on 4 and 314 DF, p-value: 4.015e-06
# Train-set metrics for fit2, stored as a one-row data frame.
fit2_metrics <- defaultSummary(data.frame(obs = tra$profit_kare, pred = predict(fit2, tra)))
fit2_res <- as.data.frame(t(fit2_metrics))
rownames(fit2_res) <- "fit2"
# Cubic polynomial plus a Region x log-sales interaction.
# (Region * sales_log_merkez already implies both main effects; lm's
# formula expansion deduplicates the repeated terms.)
fit3 <- lm(
  profit_kare ~ sales_log_merkez + I(sales_log_merkez^2) +
    I(sales_log_merkez^3) + Region + Region * sales_log_merkez,
  data = tra
)
summary(fit3)
##
## Call:
## lm(formula = profit_kare ~ sales_log_merkez + I(sales_log_merkez^2) +
## I(sales_log_merkez^3) + Region + Region * sales_log_merkez,
## data = tra)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7048 -0.0631 -0.0086 0.0559 4.5709
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.097684 0.049754 1.963 0.050505 .
## sales_log_merkez -0.077295 0.084982 -0.910 0.363770
## I(sales_log_merkez^2) 0.237325 0.044240 5.364 1.59e-07 ***
## I(sales_log_merkez^3) -0.162955 0.043256 -3.767 0.000198 ***
## RegionCentral -0.159020 0.066007 -2.409 0.016575 *
## RegionEast -0.085244 0.064469 -1.322 0.187062
## RegionSouth -0.110497 0.072848 -1.517 0.130335
## sales_log_merkez:RegionCentral 0.162768 0.091109 1.787 0.074993 .
## sales_log_merkez:RegionEast 0.008373 0.088434 0.095 0.924632
## sales_log_merkez:RegionSouth 0.109622 0.100730 1.088 0.277324
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4291 on 309 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.197, Adjusted R-squared: 0.1736
## F-statistic: 8.424 on 9 and 309 DF, p-value: 2.919e-11
# Train-set metrics for fit3, stored as a one-row data frame.
fit3_metrics <- defaultSummary(data.frame(obs = tra$profit_kare, pred = predict(fit3, tra)))
fit3_res <- as.data.frame(t(fit3_metrics))
rownames(fit3_res) <- "fit3"
# Untransformed Sales as predictor, with Region and Category.
fit4 <- lm(profit_kare ~ Sales + Region + Category, data = tra)
summary(fit4)
##
## Call:
## lm(formula = profit_kare ~ Sales + Region + Category, data = tra)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3424 -0.1397 -0.0331 0.0498 5.6699
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.226e-02 7.495e-02 1.098 0.27323
## Sales -5.786e-05 5.163e-05 -1.121 0.26332
## RegionCentral -1.562e-01 7.123e-02 -2.193 0.02905 *
## RegionEast -2.747e-02 6.901e-02 -0.398 0.69085
## RegionSouth -7.390e-02 7.831e-02 -0.944 0.34608
## CategoryOffice Supplies 1.855e-01 6.849e-02 2.709 0.00712 **
## CategoryFurniture 2.525e-02 8.171e-02 0.309 0.75751
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4629 on 312 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.05642, Adjusted R-squared: 0.03828
## F-statistic: 3.109 on 6 and 312 DF, p-value: 0.005669
# Reduced version: drop Category.
fit4 <- lm(profit_kare ~ Sales + Region, data = tra)
summary(fit4)
##
## Call:
## lm(formula = profit_kare ~ Sales + Region, data = tra)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3295 -0.1303 -0.0633 0.0307 5.7403
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.075e-01 5.147e-02 4.032 6.96e-05 ***
## Sales -9.562e-05 5.091e-05 -1.878 0.0613 .
## RegionCentral -1.528e-01 7.208e-02 -2.120 0.0348 *
## RegionEast -3.744e-02 6.983e-02 -0.536 0.5922
## RegionSouth -7.089e-02 7.912e-02 -0.896 0.3710
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4689 on 314 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.02553, Adjusted R-squared: 0.01312
## F-statistic: 2.057 on 4 and 314 DF, p-value: 0.08632
# Train-set metrics for fit4, stored as a one-row data frame.
fit4_metrics <- defaultSummary(data.frame(obs = tra$profit_kare, pred = predict(fit4, tra)))
fit4_res <- as.data.frame(t(fit4_metrics))
rownames(fit4_res) <- "fit4"
# Baseline: Sales as the only predictor.
fit5 <- lm(profit_kare ~ Sales, data = tra)
summary(fit5)
##
## Call:
## lm(formula = profit_kare ~ Sales, data = tra)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3754 -0.1110 -0.0683 0.0329 5.7652
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.451e-01 2.879e-02 5.040 7.84e-07 ***
## Sales -9.339e-05 5.092e-05 -1.834 0.0676 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4703 on 317 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.0105, Adjusted R-squared: 0.007379
## F-statistic: 3.364 on 1 and 317 DF, p-value: 0.06758
# Train-set metrics for fit5, stored as a one-row data frame.
fit5_metrics <- defaultSummary(data.frame(obs = tra$profit_kare, pred = predict(fit5, tra)))
fit5_res <- as.data.frame(t(fit5_metrics))
rownames(fit5_res) <- "fit5"
# Test-set performance for each candidate model.
# Local helper removes the four-fold copy-paste; output is unchanged.
eval_on_test <- function(model, label) {
  res <- as.data.frame(t(defaultSummary(
    data.frame(obs = test$profit_kare, pred = predict(model, test))
  )))
  rownames(res) <- label
  res
}
fit2_res_test <- eval_on_test(fit2, "fit2")
fit3_res_test <- eval_on_test(fit3, "fit3")
fit4_res_test <- eval_on_test(fit4, "fit4")
fit5_res_test <- eval_on_test(fit5, "fit5")
round(rbind(fit2_res_test, fit3_res_test, fit4_res_test, fit5_res_test), 2)
## RMSE Rsquared MAE
## fit2 0.18 0.18 0.13
## fit3 0.24 0.09 0.14
## fit4 0.21 0.04 0.15
## fit5 0.22 0.01 0.15
# Candidate models to compare by their PRESS statistic.
list2 <- list(fit2, fit3, fit4, fit5)
# PRESS (predicted residual sum of squares) for a fitted lm.
# Uses the shortcut: the leave-one-out prediction error equals the
# ordinary residual divided by (1 - leverage), so no refitting is needed.
PRESS <- function(linmodel) {
  leverage <- lm.influence(linmodel)$hat
  loo_resid <- residuals(linmodel) / (1 - leverage)
  sum(loo_resid^2)
}
# Print the PRESS statistic for every candidate model.
for (model in list2) {
  print(paste("Press:", round(PRESS(model), 3)))
}
## [1] "Press: 67.076"
## [1] "Press: 69.091"
## [1] "Press: 70.856"
## [1] "Press: 70.733"
library(ggfortify)
# Diagnostic panels for the selected model (fit2).
autoplot(fit2)
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3
# Regression tree as a complementary, interpretable model.
cart <- rpart(profit_kare ~ sales_log + Region + Category, data = tra)
cart$variable.importance
## sales_log Region Category
## 13.5219358 1.5659918 0.6717293
prp(cart, type = 5)
Yaptığımız analizler sonucunda kârı artırmak için yüksek kâr getiren “Copiers” yani fotokopi makinelerine ağırlık verilmesi gerektiğini gözlemliyoruz. Özellikle bu ağırlığın merkez bölge için daha fazla özelleştirilmesi gerektiğini söyleyebiliriz. En büyük zararı ise Doğu bölgesinde “Machines” için gözlemledik, fakat zaman zaman makineler Doğu bölgesinde kâr sağlamıştır. En yüksek zarara neden olan makinenin hangisi olduğunu tespit edip bunu düzeltmek üzerine yoğunlaşabiliriz. Yine Doğu bölgesinde “Supplies” değişkeninin sadece zarara uğrattığını gözlemleyebiliriz. Mobilyalar için ise “Tables” değişkeni Güney bölgesinde zarara uğratmıştır. “Bookcases” değişkeni ise yine Doğu bölgesinde zarar getirmiştir. Modeller arasından en az hatayı ve en yüksek R^2 değerini veren 2. modeli seçiyoruz.